{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "#如果是Keras 的 话， 直接在代码最前面加入这几行代码即可\n",
    "from keras import backend as K\n",
    "# set GPU memory \n",
    "if('tensorflow' == K.backend()):\n",
    "    import tensorflow as tf\n",
    "    from keras.backend.tensorflow_backend import set_session\n",
    "\n",
    "    config = tf.ConfigProto()\n",
    "    config.gpu_options.allow_growth = True\n",
    "    sess = tf.Session(config=config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def extract_text(x):\n",
    "    soup = BeautifulSoup(x, 'html.parser')\n",
    "    return soup.text\n",
    "\n",
    "def clear_text_punctuation(x):\n",
    "    x = html.unescape(x)  # 反转义字符,显示真实内容。\n",
    "    x = HanziConv.toSimplified(x) # 汉字繁体 转成 简体\n",
    "    x = x.strip() # 去除字符首尾的指定字符\n",
    "    x = \"\".join([i for i in x if i not in punctuation]) # 遍历评论每个字符，若是标点，去掉用空字符连接\n",
    "    return re.sub(r'\\s+', ' ', x) # 把x通过正则表达式 (\\s 代表空格，+ 代表多个)，替换成空字符   \n",
    "# 数字和字母过滤方法，及一些特殊符号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import jieba\n",
    "def jieba_word(x):\n",
    "    return [i for i in jieba.cut(x)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pymysql\n",
    "import pandas as pd\n",
    "import re\n",
    "from bs4 import BeautifulSoup\n",
    "from hanziconv import HanziConv\n",
    "from zhon import hanzi\n",
    "import string\n",
    "punctuation=hanzi.punctuation + string.punctuation\n",
    "punctuation = set([i for i in punctuation])\n",
    "import html\n",
    "from gensim.test.utils import common_texts\n",
    "from gensim.models import Word2Vec\n",
    "from tqdm import tqdm\n",
    "\n",
    "def get_comment_word2vec(sql, vecName):\n",
    "    conn = pymysql.connect(host='127.0.0.1',port=3306,user='root',password='eXYhzAWjyvy8grwM',db='game_source',charset='utf8mb4')\n",
    "    df_comment = pd.read_sql(sql, conn)  # 返回的是 dataFrame 数据结构\n",
    "    conn.close()\n",
    "    \n",
    "    df_comment['clear_content'] = df_comment['content'].apply(clear_text_punctuation)\n",
    "    df_comment['jieba_word'] = df_comment['clear_content'].apply(jieba_word)\n",
    "\n",
    "    words = tqdm([x for x in df_comment['jieba_word']])\n",
    "    model = Word2Vec(words, size=100, window=5, min_count=1, workers=4)\n",
    "    # 取出 wv 属性，是 Word2VecKeyedVectors \n",
    "    # 别直接存储 model, 虽然可以追加，但是追加训练会不包括新词，没处理好该问题\n",
    "    # wv 存储，耗内存少,具体区别 有待进一步体会\n",
    "    word_vectors = model.wv\n",
    "    word_vectors.save(vecName)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 0.605 seconds.\n",
      "Prefix dict has been built succesfully.\n",
      "100%|██████████| 4156105/4156105 [00:09<00:00, 456546.72it/s]\n",
      "100%|██████████| 3324317/3324317 [00:26<00:00, 127748.82it/s]\n"
     ]
    }
   ],
   "source": [
    "# 分别训练 qq_mobile taptap渠道的 word2vec\n",
    "get_comment_word2vec('select content from s_game_comments_qq_game','qq_comments_vector')\n",
    "get_comment_word2vec('select content from s_game_comments_taptap_game','taptap_comments_vector')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 查看 应用宝的评论文本的 模型\n",
    "from gensim.models import KeyedVectors\n",
    "qq_vector = KeyedVectors.load(\"taptap_comments_vector\", mmap='r')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('领主', 0.7517091035842896),\n",
       " ('将军', 0.681869626045227),\n",
       " ('魔王', 0.6727608442306519),\n",
       " ('勇士', 0.655035674571991),\n",
       " ('勇者', 0.6496292352676392),\n",
       " ('信徒', 0.6355671286582947),\n",
       " ('农夫', 0.6272660493850708),\n",
       " ('矿工', 0.6268424391746521),\n",
       " ('族人', 0.6232225894927979),\n",
       " ('恶魔', 0.6221526861190796)]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qq_vector.most_similar(positive=['男人', '国王'], negative=['女人'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.85668097597747572"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qq_vector.similarity('非常', '很')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('监察', 0.7858465909957886),\n",
       " ('逃生', 0.7732838988304138),\n",
       " ('大逃杀', 0.6838356852531433),\n",
       " ('逢生', 0.6438950300216675),\n",
       " ('舔食', 0.6280076503753662),\n",
       " ('潜兵', 0.6231508255004883),\n",
       " ('追猎', 0.6077479124069214),\n",
       " ('腐殖', 0.6076853275299072),\n",
       " ('逃杀', 0.6061335802078247),\n",
       " ('岛重', 0.600558876991272)]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qq_vector.similar_by_word(\"求生\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.78907560602404736"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qq_vector.n_similarity(['我', '很','喜欢','这个','游戏'], ['这个', '游戏','很','不错','挺','赞'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.19388726693426606"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qq_vector.distance(\"很\", \"特别\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "memmap([-1.33626497, -4.60259914,  1.01764572, -2.87018847,  1.13950217,\n",
       "       -0.6130681 , -1.50238705,  3.76427627,  0.48720589,  1.36765242,\n",
       "       -1.80725658,  1.06561112,  0.27709526,  0.29756361,  1.0447638 ,\n",
       "       -3.02857852,  0.82067448, -4.27022028, -0.51740491, -1.67667592,\n",
       "       -0.44198424, -2.25331998,  1.43272984, -3.63497663,  4.70034695,\n",
       "       -0.44074306, -0.21645303,  0.12574375,  2.73682618, -1.50603795,\n",
       "       -2.01849079,  3.47720718, -0.67992592,  1.2328881 ,  1.91297317,\n",
       "        3.23571277,  3.15171337,  3.19843817, -3.72199416, -1.44380736,\n",
       "        0.17890073,  1.15622211,  1.15375125, -0.44042143,  0.91262478,\n",
       "       -0.65663034,  4.39379072,  1.57259202,  3.86545992,  0.2163209 ,\n",
       "        4.85433578, -0.2512745 , -0.44280222, -0.44292584, -0.68006843,\n",
       "        0.16226456,  1.41585958,  2.25660038,  0.65347105, -2.99246645,\n",
       "       -4.10421562, -1.71246135,  2.90390587, -4.99090672, -2.73074841,\n",
       "        3.15502572,  0.43288565, -4.90127754,  0.28281939, -0.996454  ,\n",
       "        2.12569594, -1.12192488,  2.28240919,  1.17746818, -0.46081102,\n",
       "       -3.61674523,  0.86282492, -0.67854506, -1.07672358, -0.92105097,\n",
       "       -1.27177954, -2.31233597, -0.12139089,  1.30869043,  2.57475924,\n",
       "       -1.73955345, -0.43533745,  0.55516404,  2.1202364 , -0.16096522,\n",
       "        2.9513514 , -1.95177984, -2.57168865,  1.72101545,  1.58349931,\n",
       "       -0.61853522, -2.71042061, -0.27580285, -2.04253602,  4.8187089 ], dtype=float32)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qq_vector['求生']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "###################################\n",
    "#如果是Keras 的 话， 直接在代码最前面加入这几行代码即可\n",
    "from keras import backend as K\n",
    "# set GPU memory \n",
    "if('tensorflow' == K.backend()):\n",
    "    import tensorflow as tf\n",
    "    from keras.backend.tensorflow_backend import set_session\n",
    "\n",
    "    config = tf.ConfigProto()\n",
    "    config.gpu_options.allow_growth = True\n",
    "    sess = tf.Session(config=config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source</th>\n",
       "      <th>game_id</th>\n",
       "      <th>game_name</th>\n",
       "      <th>description</th>\n",
       "      <th>category</th>\n",
       "      <th>tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>taptap</td>\n",
       "      <td>1</td>\n",
       "      <td>钢琴块2 （别踩白块儿2 ）</td>\n",
       "      <td>&lt;div class=\"body-description-paragraph\" id=\"de...</td>\n",
       "      <td>休闲</td>\n",
       "      <td>手速,反应力,音乐,有毒,单机,速度,钢琴,魔性,节奏,音游,街机,竖屏,休闲</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>taptap</td>\n",
       "      <td>2</td>\n",
       "      <td>部落冲突 (Clash of Clans)</td>\n",
       "      <td>&lt;div class=\"body-description-paragraph\" id=\"de...</td>\n",
       "      <td>策略</td>\n",
       "      <td>免费,策略,塔防,动作,COC,联机,中文,全球同服,经营,联网</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>taptap</td>\n",
       "      <td>3</td>\n",
       "      <td>花花传奇</td>\n",
       "      <td>&lt;div class=\"body-description-paragraph\" id=\"de...</td>\n",
       "      <td>休闲</td>\n",
       "      <td>益智,消除,三消,休闲</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>taptap</td>\n",
       "      <td>4</td>\n",
       "      <td>地铁跑酷</td>\n",
       "      <td>&lt;div class=\"body-description-paragraph\" id=\"de...</td>\n",
       "      <td>冒险</td>\n",
       "      <td>跑酷,单机,免费,动作,冒险,益智,中文,街机,有内购,联网</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>taptap</td>\n",
       "      <td>5</td>\n",
       "      <td>神庙逃亡2</td>\n",
       "      <td>&lt;div class=\"body-description-paragraph\" id=\"de...</td>\n",
       "      <td>动作</td>\n",
       "      <td>冒险,动作,街机,跑酷,单机,竖屏</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   source  game_id              game_name  \\\n",
       "0  taptap        1         钢琴块2 （别踩白块儿2 ）   \n",
       "1  taptap        2  部落冲突 (Clash of Clans)   \n",
       "2  taptap        3                   花花传奇   \n",
       "3  taptap        4                   地铁跑酷   \n",
       "4  taptap        5                  神庙逃亡2   \n",
       "\n",
       "                                         description category  \\\n",
       "0  <div class=\"body-description-paragraph\" id=\"de...       休闲   \n",
       "1  <div class=\"body-description-paragraph\" id=\"de...       策略   \n",
       "2  <div class=\"body-description-paragraph\" id=\"de...       休闲   \n",
       "3  <div class=\"body-description-paragraph\" id=\"de...       冒险   \n",
       "4  <div class=\"body-description-paragraph\" id=\"de...       动作   \n",
       "\n",
       "                                      tags  \n",
       "0  手速,反应力,音乐,有毒,单机,速度,钢琴,魔性,节奏,音游,街机,竖屏,休闲  \n",
       "1         免费,策略,塔防,动作,COC,联机,中文,全球同服,经营,联网  \n",
       "2                              益智,消除,三消,休闲  \n",
       "3           跑酷,单机,免费,动作,冒险,益智,中文,街机,有内购,联网  \n",
       "4                        冒险,动作,街机,跑酷,单机,竖屏  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pymysql\n",
    "import pandas as pd\n",
    "import re\n",
    "from bs4 import BeautifulSoup\n",
    "from hanziconv import HanziConv\n",
    "from zhon import hanzi\n",
    "import string\n",
    "punctuation=hanzi.punctuation + string.punctuation \n",
    "import html\n",
    "conn = pymysql.connect(host='127.0.0.1',port=3306,user='root',password='eXYhzAWjyvy8grwM',db='game_source',charset='utf8mb4')\n",
    "sql = \"SELECT source,game_id, game_name, description, category, tags FROM s_game_detail where source='taptap'\"\n",
    "df_detail = pd.read_sql(sql, conn)  # 返回的是 dataFrame 数据结构\n",
    "conn.close()\n",
    "df_detail.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def is_chinese(uchar):\n",
    "        \"\"\"判断一个unicode是否是汉字\"\"\"\n",
    "        if uchar >= u'\\u4e00' and uchar<=u'\\u9fa5':\n",
    "                return True\n",
    "        else:\n",
    "                return False\n",
    "# 判断描述是否 中文字符占一半\n",
    "def get_chinese_sentence(x):\n",
    "    l = len(x)+1\n",
    "    x = \"\".join([i for i in x if is_chinese(i)])\n",
    "    return True if len(x)/l >0.8 else False      "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. 提取div id  description  text  2.去除 text里面的 html标签 \n",
    "punctuation = punctuation+ '●™★1234567890 '\n",
    "punctuation_new = set([i for i in punctuation])\n",
    "def get_text(description):\n",
    "    soup = BeautifulSoup(description, 'html.parser')\n",
    "    try :\n",
    "        x = soup.find_all(\"div\", attrs={\"id\": \"description\"})[0].get_text()\n",
    "    except:\n",
    "        return ''\n",
    "    x = html.unescape(x)  # 反转义字符,显示真实内容。\n",
    "    x = x.replace('\\n', '')\n",
    "    x = HanziConv.toSimplified(x)  # 汉字繁体 转成 简体\n",
    "    x = \"\".join([i for i in x if i not in punctuation_new]) # 遍历评论每个字符，若是标点，去掉用空字符连接\n",
    "    x = re.sub(r'[A-Za-z]+', '', x) \n",
    "    return x if get_chinese_sentence(x) else ' '"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import jieba\n",
    "def jieba_word(x):\n",
    "    return [i for i in jieba.cut(x)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 0.602 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "df_detail = df_detail[df_detail['description']!=' ']\n",
    "df_detail['clear_desc'] = df_detail['description'].apply(get_text)\n",
    "df_detail = df_detail[df_detail['clear_desc']!=' ']\n",
    "df_detail.head()\n",
    "df_detail['jieba_word'] = df_detail['clear_desc'].apply(jieba_word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.test.utils import common_texts\n",
    "from gensim.models import Word2Vec\n",
    "from tqdm import tqdm\n",
    "def get_word(df_comment, vecName):\n",
    "    words = tqdm([x for x in df_comment['jieba_word']])\n",
    "    model = Word2Vec(words, size=100, window=5, min_count=1, workers=4)\n",
    "    # 取出 wv 属性，是 Word2VecKeyedVectors \n",
    "    # 别直接存储 model, 虽然可以追加，但是追加训练会不包括新词，没处理好该问题\n",
    "    # wv 存储，耗内存少,具体区别 有待进一步体会\n",
    "    word_vectors = model.wv\n",
    "    word_vectors.save(vecName)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 26518/26518 [00:00<00:00, 30931.93it/s]\n"
     ]
    }
   ],
   "source": [
    "get_word(df_detail, 'taptap_description_vector')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
